Lots of code, so we'll just skim through this.
January 22, 2016
Lots of code, so we'll just skim through this.
df_from_url = function(url, ncol, number_columns) {
json = fromJSON(file = url, method = "C")
df = data.frame(matrix(unlist(json$resultSets[[1]][[3]]),
ncol = ncol, byrow = TRUE), stringsAsFactors = FALSE)
colnames(df) = json$resultSets[[1]][[2]]
df[, number_columns] = apply(df[, number_columns],
2, function(x) as.numeric(as.character(x)))
return(df)
}
years = sapply(2007:2015, function(year) sprintf("%4d-%02d",
year, (year + 1)%%100))
team_fmt = "http://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Plays&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=%s&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision="
team_urls = sapply(years, function(year) sprintf(team_fmt,
year))
team_dfs = sapply(team_urls, function(url) df_from_url(url,
30, c(1, 3:29)))
team_ids = Reduce(union, team_dfs[1, ])
columns = c(35, 32, 24, 27, 30)
stat_types = c("Base", "Advanced", "Misc", "Scoring",
"Usage")
player_fmt = "http://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=%s&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Plays&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=%s&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight="
players = NULL
for (year in years) {
season_df = NULL
for (i in 1:length(stat_types)) {
stat_type = stat_types[i]
c = columns[i]
numeric_columns = c(1,3,5:(c-1))
url = sprintf(player_fmt, stat_type, year)
df = df_from_url(url, c, numeric_columns)
if (is.null(season_df)) {
season_df = df
} else {
season_df = merge(
season_df, df, by=1, all.x=TRUE, suffixes=c('', sprintf('_%s', stat_type))
)
}
}
season_df$SEASON = factor(year)
if (is.null(players)) {
players = season_df
} else {
players = rbind(players, season_df)
}
}
stat_types = c("Base", "Advanced", "Four+Factors",
"Misc", "Scoring", "Opponent")
columns = c(31, 24, 18, 18, 25, 31)
lineup_fmt = "http://stats.nba.com/stats/leaguedashlineups?Conference=&DateFrom=&DateTo=&Division=&GameID=&GameSegment=&GroupQuantity=5&LastNGames=0&LeagueID=00&Location=&MeasureType=%s&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Plays&Period=0&PlusMinus=N&Rank=N&Season=%s&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&TeamID=%d&VsConference=&VsDivision="
lineups = NULL
for (year in years) {
for (team in team_ids) {
season_df = NULL
for (i in 1:length(stat_types)) {
stat_type = stat_types[i]
c = columns[i]
numeric_columns = c(4,6:c)
url = sprintf(lineup_fmt, stat_type, year, team)
df = df_from_url(url, c, numeric_columns)
if (is.null(season_df)) {
season_df = df
} else {
season_df = merge(
season_df, df, by=2, all.x=TRUE, suffixes=c('', sprintf('_%s', stat_type))
)
}
}
season_df$SEASON = factor(year)
if (is.null(lineups)) {
lineups = season_df
} else {
lineups = rbind(lineups, season_df)
}
}
}
players = mutate(players, MIN_TOTAL = MIN_Usage, MIN_GAME = MIN_Advanced)
players = select(players, -X, -matches("CFID|CFPARAMS|_[A-Z][a-z]",
FALSE))
lineups = tbl_df(lineups)
lineups = mutate(lineups, MIN_TOTAL = MIN_Advanced)
lineups = select(lineups, -X, -matches("GROUP_SET|CFID|CFPARAMS|_[A-Z][a-z]",
FALSE))
lineups = Filter(function(x) !all(is.na(x)), lineups)
lineups$PLAYERS = t(sapply(lineups$GROUP_ID, function(x) {
as.integer(unlist(strsplit(as.character(x), split = " - ")))
}))
season_col = grep("SEASON", colnames(lineups))
player_col = grep("PLAYERS", colnames(lineups))
numeric_player_columns = as.vector(which(sapply(players,
is.numeric)))
lineup_averages = data.frame(t(apply(lineups, 1, function(x) {
srows = players$SEASON == x[season_col]
prows = players$PLAYER_ID %in% as.numeric(x[player_col:player_col +
4])
sapply(players[srows & prows, numeric_player_columns],
mean)
})))
usg_weighted = data.frame(t(apply(lineups, 1, function(x) {
srows = players$SEASON == x[season_col]
prows = players$PLAYER_ID %in% as.numeric(x[player_col:player_col +
4])
stats = players[srows & prows, numeric_player_columns]
tot_usg = sum(stats$USG_PCT_PCT)
sapply(stats, function(y) sum(y * stats$USG_PCT_PCT)/tot_usg)
})))
colnames(lineups) = paste(names(lineups), "lineup",
sep = ".")
colnames(lineup_averages) = paste(names(lineup_averages),
"player", sep = ".")
colnames(usg_weighted) = paste(names(usg_weighted),
"usage", sep = ".")
nba = merge(merge(lineups, lineup_averages, by = 0),
usg_weighted, by = 0)
nba$Row.names = NULL
nba$Row.names = NULL
dim(nba)
## [1] 66784 259
mins_ok = nba$MIN_TOTAL.lineup > 4
ratio_ok = nba$MIN_TOTAL.lineup/nba$MIN_TOTAL.player <=
0.1
lineup_ok = nba$NET_RATING.lineup > -50 & nba$NET_RATING.lineup <
50
player_ok = nba$NET_RATING.player > -10 & nba$NET_RATING.player <
10
nba = nba[mins_ok & ratio_ok & lineup_ok & player_ok,
]
I'm not sure any of this would stand up to statistical scrutiny, but it should be okay for drawing pretty pictures.
| GROUP_ID.lineup | SEASON.lineup | PTS.lineup | PTS.player | PTS.usage |
|---|---|---|---|---|
| 2551 - 101122 - 101127 - 201177 - 2211 | 2008-09 | 106.1 | 18.3 | 4.5 |
| 201589 - 200768 - 201564 - 2545 - 2624 | 2008-09 | 90.3 | 10.6 | 16.7 |
| 201147 - 2744 - 201567 - 201196 - 2863 | 2009-10 | 97.6 | 12.4 | 14.9 |
| 201147 - 2744 - 2545 - 201196 - 2863 | 2009-10 | 85.8 | 12.4 | 14.9 |
| 201567 - 2545 - 2562 - 201196 - 2863 | 2009-10 | 113.4 | 12.4 | 17.3 |
| 201605 - 2562 - 200762 - 201196 - 2863 | 2009-10 | 89.2 | 12.4 | 17.1 |
.lineup indicates stats for a given lineup..player indicates the average (full-season)stats for the players in a lineup..usage indicates the usage-weighted (sull-season) stats for the players in a lineup.